IE 582 HOMEWORK 2
#gizem kurtbay- code for hw 1 of ie582
The packages are installed once; after that they only need to be loaded at the start of each session.
#Task 1
#install.packages("data.table")
#install.packages("anytime")
# Load the packages with library(), which stops with an error when a
# package is missing; require() only returns FALSE and lets the script
# continue into confusing downstream failures.
library(data.table)
library(anytime)
The data are taken into R.
# Local paths of the input RDS files (adjust for your own machine)
matches_file_path <- 'C:/Users/Gizem/Downloads/ie582icin/df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds'
odd_details_file_path <- 'C:/Users/Gizem/Downloads/ie582icin/df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds'
The data is prepared for further calculations.
# Read the two prepared RDS files (match results and betting odds)
matches=readRDS(matches_file_path)
odds=readRDS(odd_details_file_path)
# Inspect the structure of the matches table
str(matches)
## Classes 'data.table' and 'data.frame': 3122 obs. of 7 variables:
## $ leagueId: chr "df9b1196-e3cf-4cc7-9159-f236fe738215" "df9b1196-e3cf-4cc7-9159-f236fe738215" "df9b1196-e3cf-4cc7-9159-f236fe738215" "df9b1196-e3cf-4cc7-9159-f236fe738215" ...
## $ matchId : chr "KjF6FiA6" "ILVbJgQm" "SGIEDVvJ" "YwL5xFHJ" ...
## $ home : chr "tottenham" "aston villa" "wolves" "bolton" ...
## $ away : chr "manchester city" "west ham" "stoke city" "fulham" ...
## $ score : chr "0:0" "3:0" "2:1" "0:0" ...
## $ date : num 1.28e+09 1.28e+09 1.28e+09 1.28e+09 1.28e+09 ...
## $ type : chr "soccer" "soccer" "soccer" "soccer" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Remove exact duplicate rows
matches=unique(matches)
# Matches that ended 0:0
zero_zero_matches=matches[score=='0:0']
# Transform unix time to date
matches[,match_date:=anydate(date)]
# Transform unix time to date-time
matches[,match_time:=anytime(date)]
# Order by home team and match date (decreasing)
matches=matches[order(home,-match_time)]
# Drop both helper columns in a single := call.
# Previously 'date' was deleted on its own and then again together with
# 'match_date'; the second delete re-added 'date' only to remove it,
# which produced the "Adding new column 'date' then assigning NULL"
# warning.
matches[,c("date","match_date"):=NULL]
## Warning in `[.data.table`(matches, , `:=`(c("match_date", "date"), NULL)):
## Adding new column 'date' then assigning NULL (deleting it).
#matches[,3,with=F]
# Split the "H:A" score string into two character columns
matches[,c("HomeGoals","AwayGoals"):=tstrsplit(score,':')]
# Calendar features derived from the kick-off time
matches[,Year:=year(match_time)]
matches[,Month:=month(match_time)]
matches[,Weekday:=wday(match_time)]
matches[,Hour:=hour(match_time)]
# Convert the split score strings to numeric.
# Use := for both columns; the original mixed base-R `$<-` assignment
# with data.table's := for no reason.
matches[,HomeGoals:=as.numeric(HomeGoals)]
matches[,AwayGoals:=as.numeric(AwayGoals)]
# Total goals scored in the match
matches[,TotalGoals:=HomeGoals+AwayGoals]
# Over/under 2.5 label.
# NOTE: rows with NA scores get IsOver = 0 here but IsOverAlt = NA
# below; the discrepancy is harmless because those rows are removed by
# complete.cases() right after this.
matches[,IsOver:=0]
matches[TotalGoals>2,IsOver:=1]
# Alternative one-step construction of the same label
matches[,IsOverAlt:=as.numeric(TotalGoals>2)]
# Show the rows whose score is missing
matches[is.na(score)]
## leagueId matchId home
## 1: df9b1196-e3cf-4cc7-9159-f236fe738215 4tkXfsT9 fulham
## 2: df9b1196-e3cf-4cc7-9159-f236fe738215 zTB4xZCr liverpool
## 3: df9b1196-e3cf-4cc7-9159-f236fe738215 Wf5DzDse southampton
## away score type match_time HomeGoals AwayGoals
## 1: arsenal <NA> soccer 2018-10-07 15:00:00 NA NA
## 2: manchester-city <NA> soccer 2018-10-07 19:30:00 NA NA
## 3: chelsea <NA> soccer 2018-10-07 17:15:00 NA NA
## Year Month Weekday Hour TotalGoals IsOver IsOverAlt
## 1: 2018 10 1 15 NA 0 NA
## 2: 2018 10 1 19 NA 0 NA
## 3: 2018 10 1 17 NA 0 NA
# Keep only fully observed rows
matches <- matches[complete.cases(matches)]
# Average total goals per year
yearly_goals <- matches[, .(AvgGoals = mean(TotalGoals)), by = .(Year)]
# Summary statistics per year and kick-off hour
# (.N is the group size; length(matchId) is the same count, shown as an
# alternative)
yearly_hourly_goals <- matches[, .(AvgGoals = mean(TotalGoals),
                                   MaxGoals = max(TotalGoals),
                                   NGames = .N,
                                   AltNGames = length(matchId)),
                               by = .(Year, Hour)]
# The game with the maximum total goals per year/hour
max_game <- matches[, .(MaxGoals = max(TotalGoals),
                        homeMax = home[which.max(TotalGoals)],
                        awayMax = away[which.max(TotalGoals)]),
                    by = .(Year, Hour)]
# Same query, restricted to years after 2017
max_game <- matches[Year > 2017, .(MaxGoals = max(TotalGoals),
                                   homeMax = home[which.max(TotalGoals)],
                                   awayMax = away[which.max(TotalGoals)]),
                    by = .(Year, Hour)]
# Keep only the over/under 2.5 market
odds_ov_un <- odds[betType == 'ou' & totalhandicap == '2.5']
# The handicap is now constant, so drop the column
odds_ov_un[, totalhandicap := NULL]
# The full odds table could be freed here if memory were tight:
#rm(odds); gc();
# Sort so each match/oddtype/bookmaker group is in ascending date order
odds_ov_un <- odds_ov_un[order(matchId, oddtype, bookmaker, date)]
# First quoted odd per group
odds_ov_un_initial <- odds_ov_un[, .(start_odd = odd[1]),
                                 by = .(matchId, oddtype, bookmaker)]
# Last quoted odd per group
odds_ov_un_final <- odds_ov_un[, .(final_odd = odd[.N]),
                               by = .(matchId, oddtype, bookmaker)]
# Wide format: one column per oddtype/bookmaker pair
wide_odds_initial <- dcast(odds_ov_un_initial,
                           matchId ~ oddtype + bookmaker,
                           value.var = 'start_odd')
# Long-format alternative, kept for reference:
#long_ov_un_initial=melt(wide_odds_initial,id.vars=1,
# measure.vars=2:ncol(wide_odds_initial))
Up to this point, the code repeats the data preparation that was also done for the first homework.
TASK 1
Five bookmakers are chosen to check if over/under 2.5 game result can be explained by the odds for different types of bets.
#TASK 1 PART a
#SELECT 5 BOOKMAKERS
The coding for pinnacle;
# For Pinnacle:
# Keep only the Pinnacle quotes
pinnacle_odds=odds[bookmaker=='Pinnacle']
pinnacle_odds[,totalhandicap:=NULL]
# One sort is enough: betType first, then match/odd type/date.
# (The original sorted twice; data.table's stable radix sort makes the
# combined single sort equivalent.)
pinnacle_odds=pinnacle_odds[order(betType,matchId,oddtype,bookmaker,date)]
# Initial odds are not needed; only the final (latest) odd per group is
# used. betType was previously listed twice in 'by', which created a
# spurious betType.1 column that then had to be deleted.
pinnacle_odds_final=pinnacle_odds[,list(final_odd=odd[.N]),
                                  by=list(matchId,betType,oddtype,bookmaker)]
# Wide format: one column per betType_oddtype combination
pinnacle_odds_final=dcast(pinnacle_odds_final,
                          matchId~betType+oddtype,
                          value.var='final_odd')
# Merge the match results with the odds
merged_pinnacle_odds_final=merge(matches, pinnacle_odds_final,
                                 by='matchId')
# Drop rows with any missing value
merged_pinnacle_odds_final=merged_pinnacle_odds_final[complete.cases(merged_pinnacle_odds_final)]
# Keep only the outcome columns and the 7 odd columns
merged_pinnacle_odds_final_less=merged_pinnacle_odds_final[,15:23]
merged_pinnacle_odds_final_less=merged_pinnacle_odds_final_less[order(IsOver)]
# The first 2 columns are outcome flags; columns 3:9 are the PCA inputs
vars_merged_pinnacle_odds_final=merged_pinnacle_odds_final_less[,3:9]
# 7 variables, i.e. 7 dimensions for PCA.
# princomp(cor = TRUE) centres and scales internally; the explicit
# scaled matrix and correlation matrix below are kept for reference
# (renamed from the uninformative 'a' and 'b').
pinnacle_scaled=scale(vars_merged_pinnacle_odds_final, center = TRUE, scale = TRUE)
pinnacle_cor=cor(vars_merged_pinnacle_odds_final)
# 1x2_odd1 = home win odd, 1x2_odd2 = away win odd, 1x2_oddX = draw odd
pca_pinnacle = princomp(vars_merged_pinnacle_odds_final,scores = TRUE,cor = TRUE)
summary(pca_pinnacle)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.4477015 1.3548979 1.1399700 0.9289912 0.80666268
## Proportion of Variance 0.2994057 0.2622498 0.1856474 0.1232892 0.09295781
## Cumulative Proportion 0.2994057 0.5616554 0.7473028 0.8705921 0.96354987
## Comp.6 Comp.7
## Standard deviation 0.47501568 0.171787621
## Proportion of Variance 0.03223427 0.004215855
## Cumulative Proportion 0.99578414 1.000000000
#loadings are basically eigenvectors
loadings(pca_pinnacle)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 1x2_odd1 0.394 0.219 0.811 0.354
## 1x2_odd2 -0.666 0.199 0.717
## 1x2_oddX -0.564 0.295 0.471 -0.601
## ah_1 0.142 0.420 0.471 -0.292 0.614 0.346
## ah_2 -0.102 -0.485 -0.463 0.187 0.538 0.463
## ou_over 0.167 -0.555 0.390 0.384 -0.603
## ou_under -0.156 0.471 -0.525 0.420 -0.549
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.143 0.143 0.143 0.143 0.143 0.143 0.143
## Cumulative Var 0.143 0.286 0.429 0.571 0.714 0.857 1.000
# Bar chart of the variance explained by each principal component
plot(pca_pinnacle)
# Scree plot of the component variances (title typo "Screen" fixed)
screeplot(pca_pinnacle,type = "line", main = "Scree plot of pca of pinnacle")
The loadings are shown above: the weight of each variable in every component. Component 1 explains the largest share of the variance and component 2 the next largest. A higher explained variance means the component captures more of the spread in the data, so components 1 and 2 are retained.
# PCA score plot of the matches, coloured by the over/under outcome
# (palette colour 1 for IsOver = 0, colour 2 for IsOver = 1)
plot(pca_pinnacle$scores,col=1+merged_pinnacle_odds_final_less$IsOver)
#scores are the mapped points in this plot
```
The plot shows that the data is not linearly separable, but the points roughly form a U shape. Most of the data lies near the origin of the x and y axes; the points do not all follow the U shape closely, as some are spread around it.
The coding for bet365;
# For bet365:
# Keep only the bet365 quotes
bet365_odds=odds[bookmaker=='bet365']
bet365_odds[,totalhandicap:=NULL]
# One sort is enough: betType first, then match/odd type/date.
# (The original sorted twice; data.table's stable radix sort makes the
# combined single sort equivalent.)
bet365_odds=bet365_odds[order(betType,matchId,oddtype,bookmaker,date)]
# Initial odds are not needed; only the final (latest) odd per group is
# used. betType was previously listed twice in 'by', which created a
# spurious betType.1 column that then had to be deleted.
bet365_odds_final=bet365_odds[,list(final_odd=odd[.N]),
                              by=list(matchId,betType,oddtype,bookmaker)]
# Wide format: one column per betType_oddtype combination
bet365_odds_final=dcast(bet365_odds_final,
                        matchId~betType+oddtype,
                        value.var='final_odd')
# Merge the match results with the odds
merged_bet365_odds_final=merge(matches, bet365_odds_final,
                               by='matchId')
# Drop rows with any missing value
merged_bet365_odds_final=merged_bet365_odds_final[complete.cases(merged_bet365_odds_final)]
# Keep only the outcome columns and the 14 odd columns
merged_bet365_odds_final_less=merged_bet365_odds_final[,15:30]
merged_bet365_odds_final_less=merged_bet365_odds_final_less[order(IsOver)]
# The first 2 columns are outcome flags; columns 3:16 are the PCA inputs
vars_merged_bet365_odds_final=merged_bet365_odds_final_less[,3:16]
# 14 variables, i.e. 14 dimensions for PCA.
# princomp(cor = TRUE) centres and scales internally; the explicit
# scaled matrix and correlation matrix below are kept for reference
# (renamed from 'c' and 'd' — 'c' masked base::c()).
bet365_scaled=scale(vars_merged_bet365_odds_final, center = TRUE, scale = TRUE)
bet365_cor=cor(vars_merged_bet365_odds_final)
# 1x2_odd1 = home win odd, 1x2_odd2 = away win odd, 1x2_oddX = draw odd
pca_bet365 = princomp(vars_merged_bet365_odds_final,scores = TRUE,cor = TRUE)
summary(pca_bet365)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.3579691 1.6724840 1.2480902 1.2131662 1.07932053
## Proportion of Variance 0.3971441 0.1998002 0.1112664 0.1051266 0.08320949
## Cumulative Proportion 0.3971441 0.5969443 0.7082107 0.8133373 0.89654676
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.88341208 0.67071252 0.340973035 0.233805924
## Proportion of Variance 0.05574406 0.03213252 0.008304472 0.003904658
## Cumulative Proportion 0.95229083 0.98442335 0.992727818 0.996632476
## Comp.10 Comp.11 Comp.12 Comp.13
## Standard deviation 0.141195425 0.124092325 0.0753974037 0.0642538741
## Proportion of Variance 0.001424011 0.001099922 0.0004060549 0.0002948972
## Cumulative Proportion 0.998056487 0.999156409 0.9995624635 0.9998573607
## Comp.14
## Standard deviation 0.0446872524
## Proportion of Variance 0.0001426393
## Cumulative Proportion 1.0000000000
#loadings are basically eigenvectors
loadings(pca_bet365)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1x2_odd1 0.256 0.474 0.155
## 1x2_odd2 -0.417 0.247
## 1x2_oddX -0.329 0.323 0.151 0.173 0.257 0.165
## ah_1 -0.617 0.344 0.703
## ah_2 0.576 -0.399 0.708
## bts_NO 0.205 -0.188 0.320 0.564 0.166 -0.674
## bts_YES -0.232 0.205 -0.315 -0.514 -0.704
## dc_12 0.283 -0.344 -0.185 -0.231 0.825
## dc_1X 0.264 0.464 0.100
## dc_X2 -0.417 0.154
## ha_1 0.233 0.495 0.153
## ha_2 -0.413 0.288
## ou_over 0.748 -0.659
## ou_under 0.193 -0.638 -0.733
## Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## 1x2_odd1 0.177 0.122 0.717 0.354
## 1x2_odd2 0.385 0.684 -0.272 0.246
## 1x2_oddX -0.695 -0.122 -0.133 0.340
## ah_1
## ah_2
## bts_NO
## bts_YES -0.173
## dc_12
## dc_1X -0.109 0.566 0.255 -0.195 -0.517
## dc_X2 -0.391 0.126 0.420 -0.650
## ha_1 0.270 -0.598 -0.246 -0.422
## ha_2 0.448 0.403 -0.600
## ou_over
## ou_under
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.071 0.071 0.071 0.071 0.071 0.071 0.071 0.071
## Cumulative Var 0.071 0.143 0.214 0.286 0.357 0.429 0.500 0.571
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.071 0.071 0.071 0.071 0.071 0.071
## Cumulative Var 0.643 0.714 0.786 0.857 0.929 1.000
# Bar chart of the variance explained by each principal component
plot(pca_bet365)
# Scree plot of the component variances (title typo "Screen" fixed)
screeplot(pca_bet365,type = "line", main = "Scree plot of pca of bet365")
The loadings are shown above: the weight of each variable in every component. Component 1 explains the largest share of the variance and component 2 the next largest. A higher explained variance means the component captures more of the spread in the data, so components 1 and 2 are retained.
# PCA score plot of the matches, coloured by the over/under outcome
# (palette colour 1 for IsOver = 0, colour 2 for IsOver = 1)
plot(pca_bet365$scores,col=1+merged_bet365_odds_final_less$IsOver)
#scores are the mapped points in this plot
The plot shows that the data is not linearly separable, but the points cluster together in a V-shaped formation.
The coding for Betway;
# For Betway:
# Keep only the Betway quotes
Betway_odds=odds[bookmaker=='Betway']
Betway_odds[,totalhandicap:=NULL]
# One sort is enough: betType first, then match/odd type/date.
# (The original sorted twice; data.table's stable radix sort makes the
# combined single sort equivalent.)
Betway_odds=Betway_odds[order(betType,matchId,oddtype,bookmaker,date)]
# Initial odds are not needed; only the final (latest) odd per group is
# used. betType was previously listed twice in 'by', which created a
# spurious betType.1 column that then had to be deleted.
Betway_odds_final=Betway_odds[,list(final_odd=odd[.N]),
                              by=list(matchId,betType,oddtype,bookmaker)]
# Wide format: one column per betType_oddtype combination
Betway_odds_final=dcast(Betway_odds_final,
                        matchId~betType+oddtype,
                        value.var='final_odd')
# Merge the match results with the odds
merged_Betway_odds_final=merge(matches, Betway_odds_final,
                               by='matchId')
# Drop rows with any missing value
merged_Betway_odds_final=merged_Betway_odds_final[complete.cases(merged_Betway_odds_final)]
# Keep only the outcome columns and the 12 odd columns
merged_Betway_odds_final_less=merged_Betway_odds_final[,15:28]
merged_Betway_odds_final_less=merged_Betway_odds_final_less[order(IsOver)]
# The first 2 columns are outcome flags; columns 3:14 are the PCA inputs
vars_merged_Betway_odds_final=merged_Betway_odds_final_less[,3:14]
# 12 variables, i.e. 12 dimensions for PCA.
# princomp(cor = TRUE) centres and scales internally; the explicit
# scaled matrix and correlation matrix below are kept for reference
# (renamed from the uninformative 'e' and 'f').
betway_scaled=scale(vars_merged_Betway_odds_final, center = TRUE, scale = TRUE)
betway_cor=cor(vars_merged_Betway_odds_final)
# 1x2_odd1 = home win odd, 1x2_odd2 = away win odd, 1x2_oddX = draw odd
pca_Betway = princomp(vars_merged_Betway_odds_final,scores = TRUE,cor = TRUE)
summary(pca_Betway)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.3767670 1.6355313 1.2784101 1.07751639 0.79676329
## Proportion of Variance 0.4707518 0.2229136 0.1361944 0.09675346 0.05290265
## Cumulative Proportion 0.4707518 0.6936653 0.8298597 0.92661315 0.97951579
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.36428523 0.200360210 0.189429195 0.130446513
## Proportion of Variance 0.01105864 0.003345351 0.002990285 0.001418024
## Cumulative Proportion 0.99057444 0.993919790 0.996910075 0.998328099
## Comp.10 Comp.11 Comp.12
## Standard deviation 0.0900375004 0.0866625553 0.0666757569
## Proportion of Variance 0.0006755626 0.0006258665 0.0003704714
## Cumulative Proportion 0.9990036621 0.9996295286 1.0000000000
#loadings are basically eigenvectors
loadings(pca_Betway)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1x2_odd1 0.274 0.460 0.113 0.155 0.174
## 1x2_odd2 -0.410 0.325 -0.177 -0.794
## 1x2_oddX -0.310 0.348 0.170 0.189 0.483 -0.404 -0.133 0.431
## bts_NO 0.202 -0.216 0.542 0.347 0.151 0.516 -0.427 0.141
## bts_YES -0.226 0.233 -0.507 -0.325 0.438 -0.533 0.221
## dc_12 0.273 -0.358 -0.236 -0.228 0.764 0.141 0.265
## dc_1X 0.284 0.447
## dc_X2 -0.413 0.142 0.161 0.131
## ha_1 0.266 0.470 0.138 -0.272
## ha_2 -0.410 0.553 0.589 0.144
## ou_over -0.465 0.475 0.740
## ou_under 0.361 -0.661 0.656
## Comp.10 Comp.11 Comp.12
## 1x2_odd1 0.703 0.384
## 1x2_odd2 0.210
## 1x2_oddX -0.170 0.105 0.276
## bts_NO
## bts_YES
## dc_12
## dc_1X -0.469 0.136 -0.679
## dc_X2 0.406 -0.580 -0.495
## ha_1 -0.147 -0.631 0.429
## ha_2 -0.260 0.196 0.173
## ou_over
## ou_under
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.083 0.083 0.083 0.083 0.083 0.083 0.083 0.083
## Cumulative Var 0.083 0.167 0.250 0.333 0.417 0.500 0.583 0.667
## Comp.9 Comp.10 Comp.11 Comp.12
## SS loadings 1.000 1.000 1.000 1.000
## Proportion Var 0.083 0.083 0.083 0.083
## Cumulative Var 0.750 0.833 0.917 1.000
# Bar chart of the variance explained by each principal component
plot(pca_Betway)
# Scree plot of the component variances (title typo "Screen" fixed)
screeplot(pca_Betway,type = "line", main = "Scree plot of pca of Betway")
# PCA score plot of the matches, coloured by the over/under outcome
plot(pca_Betway$scores,col=1+merged_Betway_odds_final_less$IsOver)
#scores are the mapped points in this plot
The loadings are shown above: the weight of each variable in every component. Component 1 explains the largest share of the variance and component 2 the next largest, so components 1 and 2 are retained. The score plot shows that the data is not linearly separable, but the points cluster together in a V-shaped formation.
The coding for Betsson;
# For Betsson:
# Keep only the Betsson quotes
Betsson_odds=odds[bookmaker=='Betsson']
Betsson_odds[,totalhandicap:=NULL]
# One sort is enough: betType first, then match/odd type/date.
# (The original sorted twice; data.table's stable radix sort makes the
# combined single sort equivalent.)
Betsson_odds=Betsson_odds[order(betType,matchId,oddtype,bookmaker,date)]
# Initial odds are not needed; only the final (latest) odd per group is
# used. betType was previously listed twice in 'by', which created a
# spurious betType.1 column that then had to be deleted.
Betsson_odds_final=Betsson_odds[,list(final_odd=odd[.N]),
                                by=list(matchId,betType,oddtype,bookmaker)]
# Wide format: one column per betType_oddtype combination
Betsson_odds_final=dcast(Betsson_odds_final,
                         matchId~betType+oddtype,
                         value.var='final_odd')
# Merge the match results with the odds
merged_Betsson_odds_final=merge(matches, Betsson_odds_final,
                                by='matchId')
# Drop rows with any missing value
merged_Betsson_odds_final=merged_Betsson_odds_final[complete.cases(merged_Betsson_odds_final)]
# Keep only the outcome columns and the 14 odd columns
merged_Betsson_odds_final_less=merged_Betsson_odds_final[,15:30]
merged_Betsson_odds_final_less=merged_Betsson_odds_final_less[order(IsOver)]
# The first 2 columns are outcome flags; columns 3:16 are the PCA inputs
vars_merged_Betsson_odds_final=merged_Betsson_odds_final_less[,3:16]
# 14 variables, i.e. 14 dimensions for PCA.
# princomp(cor = TRUE) centres and scales internally; the explicit
# scaled matrix and correlation matrix below are kept for reference
# (renamed from the uninformative 'g' and 'h').
betsson_scaled=scale(vars_merged_Betsson_odds_final, center = TRUE, scale = TRUE)
betsson_cor=cor(vars_merged_Betsson_odds_final)
# 1x2_odd1 = home win odd, 1x2_odd2 = away win odd, 1x2_oddX = draw odd
pca_Betsson = princomp(vars_merged_Betsson_odds_final,scores = TRUE,cor = TRUE)
summary(pca_Betsson)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.2232388 1.7491279 1.4593378 1.1950556 1.07959841
## Proportion of Variance 0.3530565 0.2185320 0.1521191 0.1020113 0.08325234
## Cumulative Proportion 0.3530565 0.5715885 0.7237076 0.8257188 0.90897118
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.87340692 0.64878421 0.190092638 0.177350722
## Proportion of Variance 0.05448855 0.03006578 0.002581086 0.002246663
## Cumulative Proportion 0.96345972 0.99352550 0.996106590 0.998353253
## Comp.10 Comp.11 Comp.12 Comp.13
## Standard deviation 0.128453994 0.0684090668 3.234394e-02 2.265327e-02
## Proportion of Variance 0.001178602 0.0003342715 7.472362e-05 3.665505e-05
## Cumulative Proportion 0.999531855 0.9998661267 9.999409e-01 9.999775e-01
## Comp.14
## Standard deviation 1.774610e-02
## Proportion of Variance 2.249458e-05
## Cumulative Proportion 1.000000e+00
#loadings are basically eigenvectors
loadings(pca_Betsson)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1x2_odd1 0.389 0.245 0.154
## 1x2_odd2 -0.421 0.186 0.108
## 1x2_oddX 0.550 0.561
## ah_1 -0.281 0.633 0.167 0.695
## ah_2 0.372 -0.557 -0.180 -0.127 0.707
## bts_NO 0.155 -0.585 -0.315 0.706
## bts_YES -0.164 0.586 0.305 0.680
## dc_12 -0.559 0.778
## dc_1X 0.392 0.257 0.111 0.124
## dc_X2 -0.420 0.202 0.207
## ha_1 0.380 0.270 0.145
## ha_2 -0.414 0.212
## ou_over 0.157 -0.732 0.659
## ou_under 0.156 -0.201 0.625 0.730
## Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## 1x2_odd1 0.335 0.373 0.395 0.576
## 1x2_odd2 0.470 -0.234 -0.637 -0.160 0.251
## 1x2_oddX -0.490 -0.117 -0.236 0.223
## ah_1
## ah_2
## bts_NO
## bts_YES -0.218
## dc_12 0.254
## dc_1X 0.128 -0.603 -0.104 0.427 -0.404
## dc_X2 0.520 -0.183 0.465 -0.464
## ha_1 0.309 0.343 -0.600 -0.416
## ha_2 0.448 -0.164 0.734
## ou_over
## ou_under
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.071 0.071 0.071 0.071 0.071 0.071 0.071 0.071
## Cumulative Var 0.071 0.143 0.214 0.286 0.357 0.429 0.500 0.571
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.071 0.071 0.071 0.071 0.071 0.071
## Cumulative Var 0.643 0.714 0.786 0.857 0.929 1.000
# Bar chart of the variance explained by each principal component
plot(pca_Betsson)
# Scree plot of the component variances (title typo "Screen" fixed)
screeplot(pca_Betsson,type = "line", main = "Scree plot of pca of Betsson")
# PCA score plot of the matches, coloured by the over/under outcome
plot(pca_Betsson$scores,col=1+merged_Betsson_odds_final_less$IsOver)
#scores are the mapped points in this plot
The loadings are shown above: the weight of each variable in every component. Component 1 explains the largest share of the variance and component 2 the next largest, so components 1 and 2 are retained. The score plot shows that the data is not linearly separable, but the points cluster together in a V-shaped formation. There are fewer data points than for the previous bookmakers.
The coding for Betsafe;
# For Betsafe:
# Keep only the Betsafe quotes
Betsafe_odds=odds[bookmaker=='Betsafe']
Betsafe_odds[,totalhandicap:=NULL]
# One sort is enough: betType first, then match/odd type/date.
# (The original sorted twice; data.table's stable radix sort makes the
# combined single sort equivalent.)
Betsafe_odds=Betsafe_odds[order(betType,matchId,oddtype,bookmaker,date)]
# Initial odds are not needed; only the final (latest) odd per group is
# used. betType was previously listed twice in 'by', which created a
# spurious betType.1 column that then had to be deleted.
Betsafe_odds_final=Betsafe_odds[,list(final_odd=odd[.N]),
                                by=list(matchId,betType,oddtype,bookmaker)]
# Wide format: one column per betType_oddtype combination
Betsafe_odds_final=dcast(Betsafe_odds_final,
                         matchId~betType+oddtype,
                         value.var='final_odd')
# Merge the match results with the odds
merged_Betsafe_odds_final=merge(matches, Betsafe_odds_final,
                                by='matchId')
# Drop rows with any missing value
merged_Betsafe_odds_final=merged_Betsafe_odds_final[complete.cases(merged_Betsafe_odds_final)]
# Keep only the outcome columns and the 14 odd columns
merged_Betsafe_odds_final_less=merged_Betsafe_odds_final[,15:30]
merged_Betsafe_odds_final_less=merged_Betsafe_odds_final_less[order(IsOver)]
# The first 2 columns are outcome flags; columns 3:16 are the PCA inputs
vars_merged_Betsafe_odds_final=merged_Betsafe_odds_final_less[,3:16]
# 14 variables, i.e. 14 dimensions for PCA.
# princomp(cor = TRUE) centres and scales internally; the explicit
# scaled matrix and correlation matrix below are kept for reference
# (renamed from the uninformative 'm' and 'n').
betsafe_scaled=scale(vars_merged_Betsafe_odds_final, center = TRUE, scale = TRUE)
betsafe_cor=cor(vars_merged_Betsafe_odds_final)
# 1x2_odd1 = home win odd, 1x2_odd2 = away win odd, 1x2_oddX = draw odd
pca_Betsafe = princomp(vars_merged_Betsafe_odds_final,scores = TRUE,cor = TRUE)
summary(pca_Betsafe)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.2263411 1.7505500 1.4681621 1.1937658 1.10080036
## Proportion of Variance 0.3540425 0.2188875 0.1539643 0.1017912 0.08655439
## Cumulative Proportion 0.3540425 0.5729300 0.7268943 0.8286855 0.91523989
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.83819193 0.62618224 0.189216588 0.175270145
## Proportion of Variance 0.05018326 0.02800744 0.002557351 0.002194259
## Cumulative Proportion 0.96542316 0.99343060 0.995987951 0.998182210
## Comp.10 Comp.11 Comp.12 Comp.13
## Standard deviation 0.138212469 0.0673578705 3.055440e-02 0.0234044187
## Proportion of Variance 0.001364478 0.0003240773 6.668368e-05 0.0000391262
## Cumulative Proportion 0.999546688 0.9998707651 9.999374e-01 0.9999765749
## Comp.14
## Standard deviation 1.810941e-02
## Proportion of Variance 2.342505e-05
## Cumulative Proportion 1.000000e+00
#loadings are basically eigenvectors
loadings(pca_Betsafe)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1x2_odd1 0.388 0.242 0.164
## 1x2_odd2 -0.420 0.188 -0.110
## 1x2_oddX 0.551 0.573
## ah_1 -0.316 0.625 0.146 0.120 -0.686
## ah_2 0.392 -0.536 -0.217 -0.714
## bts_NO 0.157 -0.563 -0.358 0.107 -0.701
## bts_YES -0.166 0.568 0.335 -0.682
## dc_12 -0.559 0.771
## dc_1X 0.391 0.255 0.124 0.110
## dc_X2 -0.418 0.204 0.213
## ha_1 0.379 0.268 0.155
## ha_2 -0.413 0.214
## ou_over 0.152 -0.737 0.646 0.107
## ou_under 0.170 -0.164 0.619 0.738
## Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## 1x2_odd1 0.332 0.393 0.343 0.598
## 1x2_odd2 0.467 -0.212 -0.638 -0.191 0.248
## 1x2_oddX -0.464 -0.152 -0.244 0.208
## ah_1
## ah_2
## bts_NO 0.105
## bts_YES -0.225
## dc_12 0.281
## dc_1X 0.145 -0.590 -0.133 0.469 -0.364
## dc_X2 0.524 -0.197 0.485 -0.424
## ha_1 0.293 0.337 -0.574 -0.466
## ha_2 0.466 -0.158 0.723
## ou_over
## ou_under
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.071 0.071 0.071 0.071 0.071 0.071 0.071 0.071
## Cumulative Var 0.071 0.143 0.214 0.286 0.357 0.429 0.500 0.571
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.071 0.071 0.071 0.071 0.071 0.071
## Cumulative Var 0.643 0.714 0.786 0.857 0.929 1.000
# Bar chart of the variance explained by each principal component
plot(pca_Betsafe)
# Scree plot of the component variances (title typo "Screen" fixed)
screeplot(pca_Betsafe,type = "line", main = "Scree plot of pca of Betsafe")
# PCA score plot of the matches, coloured by the over/under outcome
plot(pca_Betsafe$scores,col=1+merged_Betsafe_odds_final_less$IsOver)
#scores are the mapped points in this plot
The loadings are shown above: the weight of each variable in every component. Component 1 explains the largest share of the variance and component 2 the next largest, so components 1 and 2 are retained. The score plot shows that the data is not linearly separable, but the points cluster together in a V-shaped formation. There are fewer data points than for the previous bookmakers.
TASK 1 PART b
MDS will be used instead of PCA in part b, so most of the code stays the same; only the MDS code replaces the PCA part. The Manhattan (city-block) distance sums the absolute coordinate differences, whereas the Euclidean distance is the ordinary straight-line distance. (Note: the distance that takes mutual correlations into account is the Mahalanobis distance, not the Manhattan distance.) The calculations are done for the 5 bookmakers and the results are plotted.
# For Pinnacle
# Pairwise Euclidean distances between matches, embedded in two
# dimensions with classical (metric) MDS
D_eucledian_pinnacle = dist(vars_merged_pinnacle_odds_final, method = "euclidean")
MDS_eucledian_pinnacle = cmdscale(D_eucledian_pinnacle, k = 2)
plot(MDS_eucledian_pinnacle[, 1], MDS_eucledian_pinnacle[, 2],
     pch = 21, bg = c("red", "green", "blue"))
# Same embedding, but built from Manhattan (city-block) distances
D_manhattan_pinnacle = dist(vars_merged_pinnacle_odds_final, method = "manhattan")
MDS_manhattan_pinnacle = cmdscale(D_manhattan_pinnacle, k = 2)
plot(MDS_manhattan_pinnacle[, 1], MDS_manhattan_pinnacle[, 2],
     pch = 23, bg = c("red", "green", "blue"))
The Euclidean and Manhattan distances give similar results to each other: both produce a V-shaped formation, the Euclidean embedding a regular V and the Manhattan embedding an upside-down V. The Euclidean embedding has its data points closer together than the Manhattan one. Neither embedding is linearly separable.
# For bet365
# Pairwise Euclidean distances between matches, embedded in two
# dimensions with classical (metric) MDS
D_eucledian_bet365 = dist(vars_merged_bet365_odds_final, method = "euclidean")
MDS_eucledian_bet365 = cmdscale(D_eucledian_bet365, k = 2)
plot(MDS_eucledian_bet365[, 1], MDS_eucledian_bet365[, 2],
     pch = 21, bg = c("red", "green", "blue"))
# Same embedding, but built from Manhattan (city-block) distances
D_manhattan_bet365 = dist(vars_merged_bet365_odds_final, method = "manhattan")
MDS_manhattan_bet365 = cmdscale(D_manhattan_bet365, k = 2)
plot(MDS_manhattan_bet365[, 1], MDS_manhattan_bet365[, 2],
     pch = 23, bg = c("red", "green", "blue"))
For bet365, the Euclidean embedding shows parallel linear bands of points, but the data is still not linearly separable. The Manhattan embedding has its points more compact and closer together, and it is also not linearly separable.
# For Betway
# Pairwise Euclidean distances between matches, embedded in two
# dimensions with classical (metric) MDS
D_eucledian_Betway = dist(vars_merged_Betway_odds_final, method = "euclidean")
MDS_eucledian_Betway = cmdscale(D_eucledian_Betway, k = 2)
plot(MDS_eucledian_Betway[, 1], MDS_eucledian_Betway[, 2],
     pch = 21, bg = c("red", "green", "blue"))
# Same embedding, but built from Manhattan (city-block) distances
D_manhattan_Betway = dist(vars_merged_Betway_odds_final, method = "manhattan")
MDS_manhattan_Betway = cmdscale(D_manhattan_Betway, k = 2)
plot(MDS_manhattan_Betway[, 1], MDS_manhattan_Betway[, 2],
     pch = 23, bg = c("red", "green", "blue"))
The Euclidean embedding has its data points close together in parallel lines, concentrated around the origin of the x and y axes, and is not linearly separable. The Manhattan embedding shows parallel linear bands but is also not linearly separable. The shapes for Betway are roughly the opposite of those for bet365 under the Manhattan and Euclidean distances.
# For Betsson
# Pairwise Euclidean distances between matches, embedded in two
# dimensions with classical (metric) MDS
D_eucledian_Betsson = dist(vars_merged_Betsson_odds_final, method = "euclidean")
MDS_eucledian_Betsson = cmdscale(D_eucledian_Betsson, k = 2)
plot(MDS_eucledian_Betsson[, 1], MDS_eucledian_Betsson[, 2],
     pch = 21, bg = c("red", "green", "blue"))
# Same embedding, but built from Manhattan (city-block) distances
D_manhattan_Betsson = dist(vars_merged_Betsson_odds_final, method = "manhattan")
MDS_manhattan_Betsson = cmdscale(D_manhattan_Betsson, k = 2)
plot(MDS_manhattan_Betsson[, 1], MDS_manhattan_Betsson[, 2],
     pch = 23, bg = c("red", "green", "blue"))
For Betsson, the data points are spread around without a particular shape for both the Euclidean and Manhattan embeddings. There are fewer data points than for the previous bookmakers, which may explain the apparent randomness; more data might reveal a more distinct shape. In the Euclidean embedding some points accumulate along a line through zero on the y axis, a pattern that could have been clearer with more data.
# For Betsafe
# Pairwise Euclidean distances between matches, embedded in two
# dimensions with classical (metric) MDS
D_eucledian_Betsafe = dist(vars_merged_Betsafe_odds_final, method = "euclidean")
MDS_eucledian_Betsafe = cmdscale(D_eucledian_Betsafe, k = 2)
plot(MDS_eucledian_Betsafe[, 1], MDS_eucledian_Betsafe[, 2],
     pch = 21, bg = c("red", "green", "blue"))
# Same embedding, but built from Manhattan (city-block) distances
D_manhattan_Betsafe = dist(vars_merged_Betsafe_odds_final, method = "manhattan")
MDS_manhattan_Betsafe = cmdscale(D_manhattan_Betsafe, k = 2)
plot(MDS_manhattan_Betsafe[, 1], MDS_manhattan_Betsafe[, 2],
     pch = 23, bg = c("red", "green", "blue"))
For Betsafe, the data points are spread around without a particular shape for both the Euclidean and Manhattan embeddings. There are fewer data points than for the previous bookmakers, which may explain the apparent randomness; more data might reveal a more distinct shape. In the Euclidean embedding some points accumulate along a line through zero on the y axis, a pattern that could have been clearer with more data. The results are very similar to those of Betsson.
TASK 1 PART C
Both PCA and MDS gave results that are not linearly separable. With PCA all bookmakers showed V-shaped formations and results similar to one another, while MDS gave a greater variety of results, so PCA was the more consistent method. Some MDS results were V-shaped, some showed parallel lines, some looked random and some clustered along a line, so MDS did not give similar results across bookmakers. The Euclidean and Manhattan distances also produced different shapes for the same bookmaker.
TASK 2
Home/Tie/Away outcomes will now be examined. Since the explanatory variables are assumed to be the same, most of the Task 1 code is reused; the only change is that the match outcome is checked instead of the over/under 2.5 result.
The codes are written for pinnacle. The Home Tie Away results are changed into numbers to plot them using pca. If the result of the match is home, the Home-Away-Tie result is numbered as 1. If the result of the match is away, the Home-Away-Tie result is numbered as 2.If the result of the match is tie, the Home-Away-Tie result is numbered as 3.
#for pinnacle
#Home, Away and Tie are created by comparing the number of goals and adding the results to the 'merged_pinnacle_odds_final' as TRUE FALSE values.
#NOTE(review): in the visible head, HomeGoals/AwayGoals come from tstrsplit(score, ':')
#and are therefore character columns; comparing characters is lexicographic ("10" < "2").
#as.numeric() forces a numeric comparison and is a no-op if the columns were already converted.
merged_pinnacle_odds_final[,IsHome := as.numeric(HomeGoals) > as.numeric(AwayGoals)]
merged_pinnacle_odds_final[,IsAway := as.numeric(HomeGoals) < as.numeric(AwayGoals)]
merged_pinnacle_odds_final[,IsTie := as.numeric(HomeGoals) == as.numeric(AwayGoals)]
#Home-Tie-Away (written as H_A_T in the code) result is encoded as a number:
#home win = 1, away win = 2, tie = 3 (exactly one indicator is TRUE per match).
merged_pinnacle_odds_final[,HATresult:=1*IsHome+2*IsAway+3*IsTie]
#pinnacle home-away-tie outcome; +1 offsets the color index past black (col 1)
plot(pca_pinnacle$scores,col=1+merged_pinnacle_odds_final$HATresult)
#scores are the mapped points in this plot
The plot shows that the data is not linearly separable, but it has a formation similar to a U shape. The data is mostly around the origin of the x and y axes. Not all of the data follows the U shape; some of the points are spread around. The shape is very similar to the PCA result for Pinnacle with the over/under comparison.
The codes are written for bet365.
#for bet365
#Home, Away and Tie are created by comparing the number of goals and adding the results to the 'merged_bet365_odds_final' as TRUE FALSE values.
#NOTE(review): in the visible head HomeGoals/AwayGoals come from tstrsplit(score, ':'),
#i.e. character columns; a character comparison is lexicographic ("10" < "2").
#as.numeric() forces a numeric comparison and is a no-op if already converted.
merged_bet365_odds_final[,IsHome := as.numeric(HomeGoals) > as.numeric(AwayGoals)]
merged_bet365_odds_final[,IsAway := as.numeric(HomeGoals) < as.numeric(AwayGoals)]
merged_bet365_odds_final[,IsTie := as.numeric(HomeGoals) == as.numeric(AwayGoals)]
#Home-Tie-Away (H_A_T) result encoded as a number: home = 1, away = 2, tie = 3.
merged_bet365_odds_final[,HATresult:=1*IsHome+2*IsAway+3*IsTie]
#bet365 home-away-tie outcome; +1 offsets the color index past black (col 1)
plot(pca_bet365$scores,col=1+merged_bet365_odds_final$HATresult)
#scores are the mapped points in this plot
The plot shows that the data is not linearly separable, but it has a V-shaped formation, meaning the data points cluster together in a shape similar to a V. The shape is very similar to the PCA result with the over/under comparison.
The codes are written for Betway.
#for Betway
#Home, Away and Tie are created by comparing the number of goals and adding the results to the 'merged_Betway_odds_final' as TRUE FALSE values.
#NOTE(review): in the visible head HomeGoals/AwayGoals come from tstrsplit(score, ':'),
#i.e. character columns; a character comparison is lexicographic ("10" < "2").
#as.numeric() forces a numeric comparison and is a no-op if already converted.
merged_Betway_odds_final[,IsHome := as.numeric(HomeGoals) > as.numeric(AwayGoals)]
merged_Betway_odds_final[,IsAway := as.numeric(HomeGoals) < as.numeric(AwayGoals)]
merged_Betway_odds_final[,IsTie := as.numeric(HomeGoals) == as.numeric(AwayGoals)]
#Home-Tie-Away (H_A_T) result encoded as a number: home = 1, away = 2, tie = 3.
merged_Betway_odds_final[,HATresult:=1*IsHome+2*IsAway+3*IsTie]
#Betway home-away-tie outcome; +1 offsets the color index past black (col 1)
plot(pca_Betway$scores,col=1+merged_Betway_odds_final$HATresult)
#scores are the mapped points in this plot
The plot shows that the data is not linearly separable, but it has a V-shaped formation, meaning the data points cluster together in a shape similar to a V. The shape is very similar to the PCA result with the over/under comparison.
The codes are written for Betsson.
#for Betsson
#Home, Away and Tie are created by comparing the number of goals and adding the results to the 'merged_Betsson_odds_final' as TRUE FALSE values.
#NOTE(review): in the visible head HomeGoals/AwayGoals come from tstrsplit(score, ':'),
#i.e. character columns; a character comparison is lexicographic ("10" < "2").
#as.numeric() forces a numeric comparison and is a no-op if already converted.
merged_Betsson_odds_final[,IsHome := as.numeric(HomeGoals) > as.numeric(AwayGoals)]
merged_Betsson_odds_final[,IsAway := as.numeric(HomeGoals) < as.numeric(AwayGoals)]
merged_Betsson_odds_final[,IsTie := as.numeric(HomeGoals) == as.numeric(AwayGoals)]
#Home-Tie-Away (H_A_T) result encoded as a number: home = 1, away = 2, tie = 3.
merged_Betsson_odds_final[,HATresult:=1*IsHome+2*IsAway+3*IsTie]
#Betsson home-away-tie outcome; +1 offsets the color index past black (col 1)
plot(pca_Betsson$scores,col=1+merged_Betsson_odds_final$HATresult)
#scores are the mapped points in this plot
The plot shows that the data is not linearly separable, but it has a V-shaped formation, meaning the data points cluster together in a shape similar to a V. There are fewer data points than for the previous bookmakers. The shape is very similar to the PCA result with the over/under comparison.
The codes are written for Betsafe.
#for Betsafe
#Home, Away and Tie are created by comparing the number of goals and adding the results to the 'merged_Betsafe_odds_final' as TRUE FALSE values.
#NOTE(review): in the visible head HomeGoals/AwayGoals come from tstrsplit(score, ':'),
#i.e. character columns; a character comparison is lexicographic ("10" < "2").
#as.numeric() forces a numeric comparison and is a no-op if already converted.
merged_Betsafe_odds_final[,IsHome := as.numeric(HomeGoals) > as.numeric(AwayGoals)]
merged_Betsafe_odds_final[,IsAway := as.numeric(HomeGoals) < as.numeric(AwayGoals)]
merged_Betsafe_odds_final[,IsTie := as.numeric(HomeGoals) == as.numeric(AwayGoals)]
#Home-Tie-Away (H_A_T) result encoded as a number: home = 1, away = 2, tie = 3.
merged_Betsafe_odds_final[,HATresult:=1*IsHome+2*IsAway+3*IsTie]
#Betsafe home-away-tie outcome; +1 offsets the color index past black (col 1)
plot(pca_Betsafe$scores,col=1+merged_Betsafe_odds_final$HATresult)
#scores are the mapped points in this plot
The plot shows that the data is not linearly separable, but it has a V-shaped formation, meaning the data points cluster together in a shape similar to a V. There are fewer data points than for the previous bookmakers except Betsson, which also has fewer data points. The shape is very similar to the PCA result with the over/under comparison.
TASK 3
#Task 3 needs image I/O (jpeg) and, optionally, plotting (ggplot2).
#install.packages("jpeg")
#install.packages("ggplot2")
#NOTE(review): library() is preferred over require() for mandatory dependencies;
#require() only returns FALSE (with a warning) when a package is missing,
#so failures surface later instead of here.
require(jpeg)
## Loading required package: jpeg
require(ggplot2)
## Loading required package: ggplot2
ggplot2 can be used since its color scales are better than base R’s. TASK 3 PART 1
#part 1 : reading image as a variable
#readJPEG() returns a numeric array; str() below shows 512 x 512 x 3
#(rows x columns x RGB channels) for this picture.
#NOTE(review): absolute local path -- this only works on the author's machine.
imagegizem<-readJPEG("C:/Users/Gizem/Desktop/ie582datamning/gizem_512_ie582hw2.jpeg")
TASK 3 PART 2
The comments on this task are fewer than in the previous tasks, since they are not required.
#structure of the variable
#str() confirms a 512 x 512 x 3 numeric array (rows x columns x RGB channels)
str(imagegizem)
## num [1:512, 1:512, 1:3] 0.682 0.678 0.678 0.678 0.678 ...
#the result will be obtained as; num [1:512, 1:512, 1:3] 0.682 0.678 0.678 0.678 0.678 ...
The result of the structure will be obtained as: num [1:512, 1:512, 1:3] 0.682 0.678 0.678 0.678 0.678 …
#dimension
#dim() returns the array extents: 512 rows, 512 columns, 3 color channels
dim(imagegizem)
## [1] 512 512 3
# the result will be obtained as; 512 512 3
The dimensions are 512x512x3 for this picture.
# a ) display the full RGB image: draw an empty 0-100 canvas, then paint
# the image array onto it with rasterImage
plot(0:100, 0:100, type = "n", ann = FALSE, axes = FALSE)
rasterImage(imagegizem, 0, 0, 100, 100)
#b ) display each channel
#a 1x3 layout puts the three channel heatmaps side by side
par(mfrow = c(1, 3))
for (channel in 1:3) {
  image(imagegizem[, , channel])
}
Each channel is shown next to each other and small differences can be observed.
#make one copy of the image per color and zero out the other two channels,
#leaving only the red, green or blue component respectively
red <- imagegizem
green <- imagegizem
blue <- imagegizem
red[, , c(2, 3)] <- 0
green[, , c(1, 3)] <- 0
blue[, , c(1, 2)] <- 0
# to plot these 3 images in one plot;
par(mfrow = c(1, 3))
for (single_channel_image in list(red, green, blue)) {
  plot(0:100, 0:100, type = "n", ann = FALSE, axes = FALSE)
  rasterImage(single_channel_image, 0, 0, 100, 100)
}
These three pictures show the image in red, green and blue; each channel corresponds to one of these three primary colors.
TASK 3 PART 3
# part 3 (of task 3)
#adding noise: each RGB channel gets its own draw of uniform noise in
#[0, 0.1]; values are then clamped at 1 so the array stays a valid image.
#pmin() clamps the whole channel at once -- this replaces the original
#per-pixel double for-loop (same result, vectorized and much faster).
new_noised_imagegizem <- imagegizem
#pixels per channel, derived from the array (512*512 here) so the code
#also works for images of other sizes
n_pixels <- prod(dim(new_noised_imagegizem)[1:2])
for (channel in seq_len(dim(new_noised_imagegizem)[3])) {
  noise <- runif(n_pixels, 0, 0.1)
  new_noised_imagegizem[, , channel] <-
    pmin(new_noised_imagegizem[, , channel] + noise, 1)
}
#plot these together: the noisy image on the left, the original on the
#right, so the differences can be compared directly
par(mfrow = c(1, 2))
for (img in list(new_noised_imagegizem, imagegizem)) {
  plot(0:100, 0:100, type = "n", ann = FALSE, axes = FALSE)
  rasterImage(img, 0, 0, 100, 100)
}
The difference is very minimal in shading, probably because the noise is so low (between 0 and 0.1). But it is clear that the first picture (the one on the left) is noisy when we look at the left side of the wall behind the face in the picture.
#displaying each channel of the noisy image separately, side by side
par(mfrow = c(1, 3))
for (channel in 1:3) {
  image(new_noised_imagegizem[, , channel])
}
The channels of the noisy picture are shown in here and the difference between them is clear.
TASK 4
Grayscaling is done with another program, and the noisy gray picture is then loaded into R.
#TASK 4
#read the externally grayscaled, noisy image and display it
#NOTE(review): absolute local path -- this only works on the author's machine.
gray_noisy_imagegizem<-readJPEG("C:/Users/Gizem/Desktop/ie582datamning/noisygizemm.jpg")
#empty 0-100 canvas, then paint the image over it
plot(0:100,0:100,type="n",ann=FALSE,axes=FALSE)
rasterImage(gray_noisy_imagegizem,0,0,100,100)
If the sliding were done, the 10x10 matrix would have been turned into an 8x8 matrix as a blurrier picture, since each pixel would have held the average of the color data in the 3x3 neighborhood around it in the 10x10 matrix. So the colors of the pixels of the 8x8 matrix would be more similar to each other than the colors of the 10x10 matrix. These were my expectations.